{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "59ab437b",
   "metadata": {},
   "source": [
    "Standardize and sample TikTok data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93af1e0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Input files: the labeled creator workbook and the consolidated TikTok export.\n",
    "creator_path = 'creator data labeled.xlsx'\n",
    "tiktok_path = 'tiktok_consolidated_data.csv'\n",
    "\n",
    "creator_data = pd.read_excel(creator_path)\n",
    "tiktok_data = pd.read_csv(tiktok_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00c860a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataFrame.info() writes its summary to stdout and returns None, so it is called\n",
    "# directly; wrapping it in print() would emit a stray 'None' after each summary.\n",
    "creator_data.info()\n",
    "tiktok_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c151cb6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lower-case the names in both sets, then print every creator whose lower-cased\n",
    "# name has no counterpart among the lower-cased TikTok user names.\n",
    "\n",
    "creator_data['Normalized Name'] = creator_data['Display Name'].str.lower()\n",
    "tiktok_data['Normalized Name'] = tiktok_data['user'].str.lower()\n",
    "\n",
    "# Compare distinct names only.\n",
    "unique_creator_names = set(creator_data['Normalized Name'].unique())\n",
    "unique_tiktok_names = set(tiktok_data['Normalized Name'].unique())\n",
    "\n",
    "# Names present in creator_data but absent from tiktok_data.\n",
    "unmatched_creators = unique_creator_names.difference(unique_tiktok_names)\n",
    "\n",
    "print(\"Unmatched creator names:\")\n",
    "for normalized_name in unmatched_creators:\n",
    "    print(normalized_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7e4d7c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep creators whose 'English' value is not 'x', then list those that match a\n",
    "# TikTok row by neither URL nor display name.\n",
    "filtered_creator_data = creator_data[creator_data['English'] != 'x']\n",
    "\n",
    "creator_urls = filtered_creator_data['URL'].tolist()\n",
    "creator_names = filtered_creator_data['Display Name'].tolist()\n",
    "\n",
    "tiktok_urls = tiktok_data['user_url'].unique().tolist()\n",
    "tiktok_names = tiktok_data['user'].unique().tolist()\n",
    "\n",
    "matched_by_url = set(creator_urls).intersection(tiktok_urls)\n",
    "matched_by_name = set(creator_names).intersection(tiktok_names)\n",
    "matched_by_either = matched_by_url | matched_by_name\n",
    "\n",
    "# A creator is unmatched only when both the URL and the display name fail to match.\n",
    "url_matched = filtered_creator_data['URL'].isin(matched_by_url)\n",
    "name_matched = filtered_creator_data['Display Name'].isin(matched_by_name)\n",
    "unmatched_creators = filtered_creator_data[~(url_matched | name_matched)]\n",
    "\n",
    "unmatched_creators_list = unmatched_creators[['Display Name', 'URL']]\n",
    "\n",
    "print(unmatched_creators_list)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d81c3136",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every creator whose 'Display Name' does not yet occur in tiktok_data['user'],\n",
    "# rename the TikTok rows sharing the creator's URL ('user_url') to that display name.\n",
    "\n",
    "creator_pairs = zip(filtered_creator_data['Display Name'], filtered_creator_data['URL'])\n",
    "\n",
    "for display_name, creator_url in creator_pairs:\n",
    "    # The user column changes inside the loop, so its unique values are re-read each time.\n",
    "    if display_name in tiktok_data['user'].unique():\n",
    "        continue\n",
    "\n",
    "    same_url_rows = tiktok_data['user_url'] == creator_url\n",
    "    if same_url_rows.any():\n",
    "        tiktok_data.loc[same_url_rows, 'user'] = display_name\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "967a9760",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check for remaining name discrepancies in both directions.\n",
    "\n",
    "creator_names = set(filtered_creator_data['Display Name'].unique())\n",
    "tiktok_user_names = set(tiktok_data['user'].unique())\n",
    "\n",
    "unmatched_creators = creator_names.difference(tiktok_user_names)\n",
    "unmatched_users = tiktok_user_names.difference(creator_names)\n",
    "\n",
    "print(\"Unmatched Creator Names in 'filtered_creator_data':\")\n",
    "for creator_name in unmatched_creators:\n",
    "    print(creator_name)\n",
    "\n",
    "print(\"\\nUnmatched User Names in 'tiktok_data':\")\n",
    "for tiktok_name in unmatched_users:\n",
    "    print(tiktok_name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f696c5b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "#manually standardize final names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bfd0f80",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Manual fixes: rewrite selected TikTok user names so they match the creator list.\n",
    "manual_renames = {'Naimdarrechi': 'Naim Darrechi', 'jules': 'Jules'}\n",
    "for old_name, new_name in manual_renames.items():\n",
    "    tiktok_data.loc[tiktok_data['user'] == old_name, 'user'] = new_name\n",
    "\n",
    "# Drop the \"Amanda\" and \"Awez Darbar\" rows from filtered_creator_data.\n",
    "dropped_names = ['Amanda', 'Awez Darbar']\n",
    "kept_rows = ~filtered_creator_data['Display Name'].isin(dropped_names)\n",
    "filtered_creator_data = filtered_creator_data[kept_rows]\n",
    "\n",
    "# Recompute the unmatched names on both sides after the fixes.\n",
    "creator_names_updated = set(filtered_creator_data['Display Name'].unique())\n",
    "tiktok_user_names_updated = set(tiktok_data['user'].unique())\n",
    "\n",
    "unmatched_creators_updated = creator_names_updated.difference(tiktok_user_names_updated)\n",
    "unmatched_users_updated = tiktok_user_names_updated.difference(creator_names_updated)\n",
    "\n",
    "print(\"Unmatched Creator Names in 'filtered_creator_data' after updates:\")\n",
    "for creator_name in unmatched_creators_updated:\n",
    "    print(creator_name)\n",
    "\n",
    "print(\"\\nUnmatched User Names in 'tiktok_data' after updates:\")\n",
    "for tiktok_name in unmatched_users_updated:\n",
    "    print(tiktok_name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcdd47a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Delete the extra users in the TikTok data: those with no matching 'Display Name'.\n",
    "\n",
    "known_creators = set(filtered_creator_data['Display Name'])\n",
    "unmatched_users = set(tiktok_data['user']).difference(known_creators)\n",
    "\n",
    "print(\"Users in 'tiktok_data' without a matching 'Display Name' in 'filtered_creator_data':\")\n",
    "for user in unmatched_users:\n",
    "    print(user)\n",
    "\n",
    "# Only rows whose user has a match in filtered_creator_data are kept.\n",
    "has_match = ~tiktok_data['user'].isin(unmatched_users)\n",
    "tiktok_data_filtered = tiktok_data[has_match]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "864901f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove videos by creators whose 'English' value is anything other than 1.\n",
    "is_non_english = filtered_creator_data['English'] != 1\n",
    "non_english_creators = filtered_creator_data.loc[is_non_english, 'Display Name']\n",
    "\n",
    "by_non_english_creator = tiktok_data_filtered['user'].isin(non_english_creators)\n",
    "tiktok_data_filtered = tiktok_data_filtered[~by_non_english_creator]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "947e87e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Renumber both frames from 0 after the row filtering; the old index is discarded.\n",
    "filtered_creator_data = filtered_creator_data.reset_index(drop=True)\n",
    "tiktok_data_filtered = tiktok_data_filtered.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e77f758a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the years each creator appears on the top creator list.\n",
    "\n",
    "def find_years_in_dates(date_entry, years_to_find=('2020', '2021', '2022', '2023')):\n",
    "    \"\"\"Return the years (as ints) that occur as substrings of str(date_entry).\n",
    "\n",
    "    Args:\n",
    "        date_entry: Any value; it is converted with str() before searching.\n",
    "        years_to_find: Year strings to look for, checked in this order.\n",
    "\n",
    "    Returns:\n",
    "        A list of ints, one per year found, in the order of years_to_find.\n",
    "        The list is empty when none of the years occurs.\n",
    "    \"\"\"\n",
    "    date_entry_str = str(date_entry)\n",
    "    return [int(year) for year in years_to_find if year in date_entry_str]\n",
    "\n",
    "# assign() builds a new frame instead of writing a column into a frame that came\n",
    "# from boolean filtering, which avoids pandas' SettingWithCopyWarning.\n",
    "filtered_creator_data = filtered_creator_data.assign(\n",
    "    Years=filtered_creator_data['Dates'].apply(find_years_in_dates)\n",
    ")\n",
    "\n",
    "filtered_creator_data[['Display Name', 'Dates', 'Years']]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f149293c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract a single year for each TikTok video.\n",
    "\n",
    "def find_year_in_date(date_entry, years_to_find=('2020', '2021', '2022', '2023')):\n",
    "    \"\"\"Return the first year (as an int) that occurs as a substring of str(date_entry).\n",
    "\n",
    "    Args:\n",
    "        date_entry: Any value; it is converted with str() before searching.\n",
    "        years_to_find: Year strings to look for, checked in this order.\n",
    "\n",
    "    Returns:\n",
    "        The first matching year as an int, or None when no year occurs.\n",
    "    \"\"\"\n",
    "    date_entry_str = str(date_entry)\n",
    "    for year in years_to_find:\n",
    "        if year in date_entry_str:\n",
    "            return int(year)\n",
    "    return None\n",
    "\n",
    "# assign() builds a new frame instead of writing a column into a frame that came\n",
    "# from boolean filtering, which avoids pandas' SettingWithCopyWarning.\n",
    "tiktok_data_filtered = tiktok_data_filtered.assign(\n",
    "    Year=tiktok_data_filtered['date'].apply(find_year_in_date)\n",
    ")\n",
    "\n",
    "tiktok_data_filtered.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56536b22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep, for each creator in filtered_creator_data, only the videos whose 'Year' is\n",
    "# one of that creator's 'Years'. Rows of users absent from filtered_creator_data\n",
    "# are left untouched.\n",
    "\n",
    "# Build the creator -> allowed years lookup once, instead of scanning and dropping\n",
    "# rows of the video frame once per creator. If a display name occurs on several\n",
    "# rows, only the years common to all of them are allowed, which is what filtering\n",
    "# row after row would produce.\n",
    "allowed_years = {}\n",
    "for creator_name, active_years in zip(filtered_creator_data['Display Name'],\n",
    "                                      filtered_creator_data['Years']):\n",
    "    years = set(active_years)\n",
    "    allowed_years[creator_name] = allowed_years.get(creator_name, years) & years\n",
    "\n",
    "# A video with no detected year (missing 'Year') is never in the allowed set, so it\n",
    "# is removed for listed creators.\n",
    "keep_video = pd.Series(\n",
    "    [\n",
    "        user not in allowed_years or year in allowed_years[user]\n",
    "        for user, year in zip(tiktok_data_filtered['user'], tiktok_data_filtered['Year'])\n",
    "    ],\n",
    "    index=tiktok_data_filtered.index,\n",
    "    dtype=bool,\n",
    ")\n",
    "\n",
    "tiktok_data_filtered = tiktok_data_filtered.loc[keep_video].reset_index(drop=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c8d85ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the filtered videos to CSV without the index column.\n",
    "filtered_output_path = \"tiktok_data_onlyenglish_onlyyearsonlist.csv\"\n",
    "tiktok_data_filtered.to_csv(filtered_output_path, index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d9509a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of videos per creator, printed from fewest to most.\n",
    "rows_by_user = tiktok_data_filtered.groupby('user')\n",
    "videos_per_creator = rows_by_user.size()\n",
    "\n",
    "sorted_videos_per_creator = videos_per_creator.sort_values(ascending=True)\n",
    "\n",
    "print(sorted_videos_per_creator)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33d237f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter out the creators with the fewest videos.\n",
    "\n",
    "total_creators = tiktok_data_filtered['user'].nunique()\n",
    "print(f\"Total number of creators: {total_creators}\")\n",
    "\n",
    "videos_per_creator = tiktok_data_filtered.groupby('user').size().sort_values()\n",
    "\n",
    "# The first two entries of the ascending counts are removed, whatever their counts.\n",
    "# NOTE(review): re-running this cell removes two more creators each time.\n",
    "creators_to_remove = videos_per_creator.index[:2]\n",
    "\n",
    "kept_rows = ~tiktok_data_filtered['user'].isin(creators_to_remove)\n",
    "tiktok_data_filtered = tiktok_data_filtered[kept_rows]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4682243",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Randomly select up to 28 videos per creator (all of them when a creator has fewer).\n",
    "# Each group is sampled explicitly and the pieces are concatenated, rather than going\n",
    "# through groupby.apply: on pandas versions where apply excludes the grouping column,\n",
    "# apply followed by reset_index(drop=True) would lose the 'user' column. The rows\n",
    "# selected (same random_state per group) and their order are unchanged.\n",
    "sampled_videos = pd.concat(\n",
    "    [\n",
    "        creator_videos.sample(n=min(len(creator_videos), 28), random_state=1)\n",
    "        for _user, creator_videos in tiktok_data_filtered.groupby('user')\n",
    "    ]\n",
    ").reset_index(drop=True)\n",
    "\n",
    "sampled_videos.to_csv(\"tiktok_allenglish_onlytoplistyears_equalvideospercreator28.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2289b6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Randomly select up to 10 videos per creator (all of them when a creator has fewer).\n",
    "# Each group is sampled explicitly and the pieces are concatenated, rather than going\n",
    "# through groupby.apply: on pandas versions where apply excludes the grouping column,\n",
    "# apply followed by reset_index(drop=True) would lose the 'user' column. The rows\n",
    "# selected (same random_state per group) are unchanged.\n",
    "sample1 = pd.concat(\n",
    "    [\n",
    "        creator_videos.sample(n=min(len(creator_videos), 10), random_state=1)\n",
    "        for _user, creator_videos in tiktok_data_filtered.groupby('user')\n",
    "    ]\n",
    ").reset_index(drop=True)\n",
    "\n",
    "# Shuffle the rows so creators are interleaved, with a fixed seed for reproducibility.\n",
    "sample1 = sample1.sample(frac=1, random_state=1).reset_index(drop=True)\n",
    "\n",
    "sample1.to_csv(\"tiktok_sample1_tenpercreator.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47f7b125",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the 28-per-creator sample back from its CSV file.\n",
    "sample_28_path = \"tiktok_allenglish_onlytoplistyears_equalvideospercreator28.csv\"\n",
    "df = pd.read_csv(sample_28_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59d3ef42",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample 5 videos per creator. A fixed random_state makes the sample reproducible,\n",
    "# in line with the other sampling cells in this notebook.\n",
    "# Note: groupby(...).sample(n=5) raises if any creator has fewer than 5 rows.\n",
    "sampled_df = df.groupby('user').sample(n=5, random_state=1)\n",
    "\n",
    "# Shuffle the rows, again with a fixed seed.\n",
    "shuffled_df = sampled_df.sample(frac=1, random_state=1).reset_index(drop=True)\n",
    "\n",
    "# Save the shuffled sample to its own CSV file.\n",
    "shuffled_df.to_csv(\"tiktok_randomsample_2.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
